The Arsenal Files 8

home *** CD-ROM | disk | FTP | other *** search

/ The Arsenal Files 8 / The Arsenal Files Collection #8 (Arsenal Computer) (1996).ISO / prg_gen / euphor14.zip / GURU.EX < prev next >

Wrap

Text File | 1996-09-05 | 16KB | 664 lines

-- Guru -- usage: -- to search EUPHORIA directories: -- guru word1 word2 word3 ... -- -- to search the current directory: -- cdguru word1 word2 word3 ... -- -- Searches for the best articles that contain the words that you type. -- Each word can contain * and ? wildcard characters. -- The articles are given a score and presented to you sorted by score. -- The scoring system strongly favors articles that contain several of -- your words, rather than just several occurrences of one of your words. -- Some very common words are ignored (see noise_words). -- e.g. -- guru sequence* atom *pend g?r? -- -- Results are displayed on screen and also saved in "c:\guru.out" -- Hints - remember to add * to words that can be pluralized or have -- many different endings. -- - enter an important word twice to double the value of that word without type_check include file.e include wildcard.e include graphics.e include sort.e -------- some user-modifiable parameters: sequence log_name, log_path log_name = "guru.out" log_path = "c:\\" & log_name -- place to store results -- files to skip: sequence skip_list skip_list = { "*.EXE", "*.DLL", "*.LIB", "*.OBJ", "*.SWP", "*.PAR", "*.ZIP", "*.BMP", "*.GIF", "*.JPG" } -- ignore these extremely common words when searching sequence noise_words noise_words = { "a", "an", "the", "to", "and", "of", "is", "or", "by", "as", "in", "you", "are", "be", "if", "?", "*" } constant separator_line = repeat(196, 5) constant MAX_CHUNKS = 20 -- maximum number of chunks to display -- desired size for a chunk of text: constant MIN_CHUNK_SIZE = 10, -- minimum number of lines MAX_CHUNK_SIZE = 20 -- maximum number of lines constant LEFT_HIGHLIGHT = 17, -- highlight markers for matched words RIGHT_HIGHLIGHT = 16 -- (assume LEFT_HIGHLIGHT > RIGHT_HIGHLIGHT) constant HIGHLIGHT_COLOR = BRIGHT_WHITE -------- end of user-modifiable parameters constant KEYB = 0, SCREEN = 1, ERR = 2 constant TRUE = 1, FALSE = 0 constant EOF = -1 type boolean(integer x) return x = 0 or x = 1 end type sequence pos, word_list, word_count, string, orig_string, file_spec boolean wild_string, euphoria integer count_line atom start_time integer log_file constant LINE_WIDTH = 83 function clean(sequence line) -- replace any funny control characters -- and put in \n's to help break up long lines sequence new_line integer c, col new_line = "" col = 1 for i = 1 to length(line) do if col > LINE_WIDTH then new_line = append(new_line, '\n') col = 1 end if c = line[i] col = col + 1 if c < 14 then if c = '\n' then col = 1 elsif c = '\r' then c = ' ' elsif c != '\t' then c = '.' end if end if new_line = append(new_line, c) end for return new_line end function boolean display display = TRUE procedure both_puts(object text) puts(log_file, text) if display then puts(SCREEN, text) end if end procedure procedure both_printf(sequence format, object values) printf(log_file, format, values) if display then printf(SCREEN, format, values) end if end procedure constant MAX_LINE = 100 -- space for largest line sequence buff buff = repeat(0, MAX_LINE) function safe_gets(integer fn) -- Return the next line of text - always with \n on the end. -- Lines are split at MAX_LINE to prevent -- "out of memory" problems on humongous lines -- and to reduce the amount of extraneous output. integer c for b = 1 to MAX_LINE-1 do c = getc(fn) if c <= LEFT_HIGHLIGHT then if c = '\n' then buff[b] = c return buff[1..b] elsif c = EOF then if b = 1 then return EOF else buff[b] = '\n' return buff[1..b] end if elsif c >= RIGHT_HIGHLIGHT or c = 0 then c = '.' end if end if buff[b] = c end for buff[MAX_LINE] = '\n' return buff[1..MAX_LINE] end function function sum(sequence s) -- sum of a sequence atom sum sum = 0 for i = 1 to length(s) do sum = sum + s[i] end for return sum end function object line integer line_next boolean words_on_line sequence char_class -- 0 means not legitimate -- 1 means legitimate char -- > 1 means possible first char of matching word char_class = repeat(0, 255) char_class['A'..'Z'] = 1 char_class['a'..'z'] = 1 char_class['0'..'9'] = 1 char_class['_'] = 1 function has_punctuation(sequence word) -- TRUE if word contains any punctuation characters integer c for i = 1 to length(word) do c = word[i] if char_class[c] = 0 and c != '?' and c != '*' then return TRUE end if end for return FALSE end function function next_word() -- Return next possible matching word from line -- based on first letter of the word. sequence word integer c while TRUE do -- skip white space: while TRUE do c = line[line_next] line_next = line_next + 1 if char_class[c] > 0 then exit elsif c = '\n' then -- there's always a '\n' at end of line return -1 end if end while words_on_line = TRUE -- check first letter in word: if char_class[c] > 1 then -- possible matching word word = {c} -- read rest of word while TRUE do c = line[line_next] if char_class[c] = 0 then return word end if line_next = line_next + 1 word = word & c end while else -- not a possible matching word -skip it while TRUE do c = line[line_next] if char_class[c] = 0 then exit end if line_next = line_next + 1 end while end if end while end function sequence chunk_list chunk_list = {{-1, {}, {}}} integer worst_chunk, worst_score procedure highlight(sequence text) -- print a line with highlighted words in color integer c if not display then return end if for i = 1 to length(text) do c = text[i] if c = LEFT_HIGHLIGHT then text_color(HIGHLIGHT_COLOR) elsif c = RIGHT_HIGHLIGHT then text_color(WHITE) else puts(SCREEN, c) end if end for end procedure procedure print_chunk_list() -- print the best chunks found sequence chunk, line position(count_line, 1) for i = 1 to length(word_list) do both_printf("%s:%d ", {word_list[i], word_count[i]}) end for position(count_line+1, 1) puts(SCREEN, repeat(' ', 80)) puts(log_file, '\n') for i = 1 to length(chunk_list) - 1 do if i > 1 and display then text_color(GREEN) puts(SCREEN, "\nPress q to quit, Enter for more:") text_color(WHITE) puts(SCREEN, " ") if getc(0) = 'q' then display = FALSE end if end if text_color(RED) both_printf("\n#%d of %d ------ %s --- score: %d ------\n", {i, length(chunk_list)-1, chunk_list[i][2], 100 * chunk_list[i][1] + 0.5}) text_color(WHITE) chunk = chunk_list[i][3] wrap(FALSE) for j = 1 to length(chunk) do line = clean(chunk[j]) highlight(line) puts(log_file, line) end for wrap(TRUE) end for if length(chunk_list) > 1 then text_color(GREEN) puts(SCREEN, "\nSee " & log_path & '\n') end if text_color(WHITE) puts(SCREEN, " \n") end procedure procedure save_chunk(sequence file_name, sequence chunk, atom score) -- record an interesting chunk on the chunk list score = score / (10 + sqrt(length(chunk))) -- reduce slightly for larger chunks for i = 1 to length(chunk_list) do if score > chunk_list[i][1] then -- insert chunk into list at proper position chunk_list = append(chunk_list[1..i-1], {score, file_name, chunk}) & chunk_list[i..length(chunk_list)] if length(chunk_list) > MAX_CHUNKS+1 then -- drop the worst chunk on the list chunk_list = chunk_list[1..length(chunk_list)-1] end if exit end if end for end procedure sequence wild_word procedure scan(sequence file_name) -- read next file integer fileNum, first_hit, last_hit, new_chunk sequence lword, chunk, word_value object word atom chunk_total, line_total boolean doc_file, matched, first_match fileNum = open(file_name, "rb") if fileNum = -1 then return end if -- is it a Euphoria .doc file? doc_file = euphoria and match(".DOC", file_name) -- update display wrap(FALSE) position(count_line, 1) for i = 1 to length(word_list) do printf(SCREEN, "%s:%d ", {word_list[i], word_count[i]}) end for position(count_line+1, 1) puts(SCREEN, "searching: " & file_name & repeat(' ', 80) & '\r') wrap(TRUE) new_chunk = TRUE while TRUE do -- initialize if new_chunk then chunk = {} chunk_total = 0 first_hit = 0 last_hit = 0 new_chunk = FALSE word_value = repeat(1, length(word_list)) end if line_next = 1 line_total = 0 -- read next line line = safe_gets(fileNum) if atom(line) then exit -- end of file end if words_on_line = FALSE while TRUE do -- read next word in line word = next_word() if atom(word) then exit end if lword = lower(word) first_match = TRUE for i = 1 to length(word_list) do if wild_word[i] then -- slow matched = wildcard_match(word_list[i], lword) else -- fast matched = compare(word_list[i], lword) = 0 end if if matched then -- score a bit higher for matching a non-wildcard word line_total = line_total + word_value[i] * (1.0 + 0.5 * (match(separator_line, line) != 0) + 0.3 * (not wild_word[i]) + 0.3 * doc_file) word_count[i] = word_count[i] + 1 word_value[i] = word_value[i] / 2 if first_match then first_match = FALSE line = line[1..line_next - length(word) - 1] & LEFT_HIGHLIGHT & word & RIGHT_HIGHLIGHT & line[line_next..length(line)] line_next = line_next + 2 end if end if end for end while chunk = append(chunk, line) -- decide chunk boundaries if words_on_line then if line_total > 0 then chunk_total = chunk_total + line_total last_hit = length(chunk) if first_hit = 0 then first_hit = last_hit end if end if if chunk_total > 0 then if (line_total = 0 and last_hit < length(chunk) - MIN_CHUNK_SIZE/2 and length(chunk) >= MIN_CHUNK_SIZE) or length(chunk) >= MAX_CHUNK_SIZE then if length(chunk) <= MIN_CHUNK_SIZE then first_hit = 1 last_hit = length(chunk) else -- trim off some context, but not all first_hit = floor((first_hit + 1) / 2) last_hit = floor((last_hit + length(chunk)) / 2) end if save_chunk(file_name, chunk[first_hit..last_hit], chunk_total) new_chunk = TRUE end if elsif length(chunk) >= MIN_CHUNK_SIZE then new_chunk = TRUE end if elsif chunk_total = 0 and length(chunk) > MIN_CHUNK_SIZE/2 then new_chunk = TRUE end if end while if chunk_total > 0 then save_chunk(file_name, chunk, chunk_total) end if close(fileNum) return end procedure procedure look_at(sequence path_name, sequence entry) -- see if a file name qualifies for searching sequence file_name file_name = entry[D_NAME] if compare(file_name, log_name) = 0 then return -- avoid circularity end if -- check skip list for i = 1 to length(skip_list) do if wildcard_file(skip_list[i], file_name) then return end if end for path_name = path_name & '\\' if compare(path_name[1..2], ".\\") = 0 then path_name = path_name[3..length(path_name)] end if path_name = path_name & file_name scan(path_name) end procedure procedure walk_dir(sequence path_name) -- walk through a directory and its subdirectories -- "looking" at each file object d integer key d = dir(path_name) while find(path_name[length(path_name)], " \\") do path_name = path_name[1..length(path_name)-1] end while if atom(d) then return end if d = sort(d) for i = 1 to length(d) do if find('d', d[i][D_ATTRIBUTES]) then if not find(d[i][D_NAME], {".", ".."}) then walk_dir(path_name & '\\' & d[i][D_NAME]) end if else look_at(path_name, d[i]) end if key = get_key() if key = 'q' then print_chunk_list() abort(1) end if end for end procedure procedure usage(sequence g) text_color(MAGENTA) puts(SCREEN, "\n\t\t" & g & " Guru\n\n") text_color(WHITE) puts(SCREEN, "Enter keywords that will define the subject you are interested in. \n") puts(SCREEN, " - Upper/lower case is not important.\n") puts(SCREEN, " - Words may contain * and ? wildcard characters,\n") puts(SCREEN, " - example ---> get? input *routine*\n\n") puts(SCREEN, "---> ") end procedure function blank_delim(sequence s) -- break up a blank-delimited string sequence list, segment integer i list = {} i = 1 while i < length(s) do while find(s[i], " \t") do i = i + 1 end while if s[i] = '\n' then exit end if segment = "" while not find(s[i], " \t\n") do segment = segment & s[i] i = i + 1 end while list = append(list, segment) end while return list end function log_name = upper(log_name) sequence cmd cmd = command_line() -- ex guru.ex words... euphoria = FALSE if length(cmd) < 3 then usage("Current Directory") cmd = blank_delim(gets(0)) puts(SCREEN, '\n') elsif compare(cmd[3], "E!") = 0 then -- search Euphoria directories euphoria = TRUE if length(cmd) <= 3 then usage("Euphoria") cmd = blank_delim(gets(0)) puts(SCREEN, '\n') else cmd = cmd[4..length(cmd)] end if else cmd = cmd[3..length(cmd)] end if log_file = open(log_path, "w") if log_file = -1 then puts(ERR, "Couldn't open " & log_path & '\n') abort(1) end if word_list = {} wild_word = {} for i = 1 to length(cmd) do cmd[i] = lower(cmd[i]) if find(cmd[i], noise_words) then puts(SCREEN, "ignoring: " & cmd[i] & " (too common)\n") elsif has_punctuation(cmd[i]) then puts(SCREEN, "ignoring: " & cmd[i] & " (contains punctuation character)\n") else word_list = append(word_list, cmd[i]) wild_word = append(wild_word, find('*', cmd[i]) or find('?', cmd[i])) end if end for if length(word_list) = 0 then abort(1) end if word_count = repeat(0, length(word_list)) integer first_char -- prepare char_class[] for efficient detection of a -- possible first letter in one of the words for i = 1 to length(word_list) do first_char = word_list[i][1] if first_char = '*' or first_char = '?' then char_class = char_class * 2 -- select all allowed chars exit elsif char_class[first_char] > 0 then char_class[first_char] = 2 -- select upper or lower case if first_char >= 'A' and first_char <= 'Z' then char_class[first_char - 'A' + 'a'] = 2 elsif first_char >= 'a' and first_char <= 'z' then char_class[first_char - 'a' + 'A'] = 2 end if end if end for file_spec = {"*.*"} -- quits after finishing current file puts(SCREEN, "Press q to quit\n\n\n") sequence gp gp = get_position() count_line = gp[1]-1 object d if euphoria then d = getenv("EUDIR") if atom(d) then d = "C:\\EUPHORIA" end if if sequence(dir(d)) then -- reduce noise in Euphoria Help skip_list = skip_list & {"*.BAS", "*.BAT", "LW.DOC", "BIND.EX", "EX.ERR"} walk_dir(d) print_chunk_list() abort(0) end if end if puts(log_file, "Searching " & current_dir() & "\n\n") if sequence(dir(".")) then walk_dir(".") else walk_dir(current_dir()) end if print_chunk_list() without warning